Goal Intensities

Try to model goal frequencies as Poisson processes based on the teams' ranks.



In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [12]:

    
# Read the World Cup match records from the local JSON dump
import json

with open('data/all_matches.json') as match_file:
    all_match_data = json.load(match_file)



In [13]:

    
# Peek at the first match record to see which fields each match carries.
all_match_data[0]









    Out[13]:





{u'away_team': {u'code': u'CRO', u'country': u'Croatia', u'goals': 1},
 u'away_team_events': [{u'id': 677,
   u'player': u'Brozovi\u0106',
   u'time': u'61',
   u'type_of_event': u'substitution-in'},
  {u'id': 674,
   u'player': u'Corluka',
   u'time': u'66',
   u'type_of_event': u'yellow-card'},
  {u'id': 675,
   u'player': u'Lovren',
   u'time': u'69',
   u'type_of_event': u'yellow-card'},
  {u'id': 676,
   u'player': u'Rebi\u0106',
   u'time': u'78',
   u'type_of_event': u'substitution-in'}],
 u'datetime': u'2014-06-12T17:00:00.000-03:00',
 u'home_team': {u'code': u'BRA', u'country': u'Brazil', u'goals': 3},
 u'home_team_events': [{u'id': 662,
   u'player': u'Marcelo',
   u'time': u'11',
   u'type_of_event': u'goal-own'},
  {u'id': 665,
   u'player': u'Neymar Jr',
   u'time': u'27',
   u'type_of_event': u'yellow-card'},
  {u'id': 666,
   u'player': u'Neymar Jr',
   u'time': u'29',
   u'type_of_event': u'goal'},
  {u'id': 664,
   u'player': u'Paulinho',
   u'time': u'63',
   u'type_of_event': u'substitution-out'},
  {u'id': 672,
   u'player': u'Hernanes',
   u'time': u'63',
   u'type_of_event': u'substitution-in'},
  {u'id': 663,
   u'player': u'Hulk',
   u'time': u'68',
   u'type_of_event': u'substitution-out'},
  {u'id': 673,
   u'player': u'Bernard',
   u'time': u'68',
   u'type_of_event': u'substitution-in'},
  {u'id': 667,
   u'player': u'Neymar Jr',
   u'time': u'71',
   u'type_of_event': u'goal-penalty'},
  {u'id': 670,
   u'player': u'L Gustavo',
   u'time': u'88',
   u'type_of_event': u'yellow-card'},
  {u'id': 668,
   u'player': u'Neymar Jr',
   u'time': u'88',
   u'type_of_event': u'substitution-out'},
  {u'id': 671,
   u'player': u'Ramires',
   u'time': u'88',
   u'type_of_event': u'substitution-in'},
  {u'id': 669,
   u'player': u'Oscar',
   u'time': u'901',
   u'type_of_event': u'goal'}],
 u'location': u'Arena de Sao Paulo',
 u'match_number': 1,
 u'status': u'completed',
 u'winner': u'Brazil',
 u'winner_code': u'BRA'}



In [25]:

    
# Load into a pandas dataframe
import pandas as pd

# First make a simpler dataset: one flat record per team per match, made of the
# team's own summary dict plus a `home` flag (1 = listed as home team, 0 = away).
# dict(mapping, home=...) copies the team dict and adds the flag. Unlike
# `a.items() + b.items()`, it also works on Python 3, where items() returns a
# view that does not support `+`.
home_simple = [dict(game['home_team'], home=1) for game in all_match_data]
away_simple = [dict(game['away_team'], home=0) for game in all_match_data]

# The same records in two orders, so that position k in one list is the
# opponent of position k in the other.
simple = home_simple + away_simple
simple_alt = away_simple + home_simple



In [31]:

    
# Put both sides of a match on one row by joining the two orderings on their
# positional index: `_1` columns come from `simple_alt`, `_2` columns from
# `simple`. Every match therefore appears twice, once from each team's side.
# Missing values are replaced with 0.
temp_df = pd.DataFrame(simple)
matches_df = (
    pd.DataFrame(simple_alt)
    .join(temp_df, lsuffix='_1', rsuffix='_2')
    .fillna(0)
)

matches_df.head()









    Out[31]:






  
    
      
      code_1
      country_1
      goals_1
      home_1
      penalties_1
      code_2
      country_2
      goals_2
      home_2
      penalties_2
    
  
  
    
      0
       CRO
           Croatia
       1
       0
       0
       BRA
         Brazil
       3
       1
       0
    
    
      1
       CMR
          Cameroon
       0
       0
       0
       MEX
         Mexico
       1
       1
       0
    
    
      2
       NED
       Netherlands
       5
       0
       0
       ESP
          Spain
       1
       1
       0
    
    
      3
       AUS
         Australia
       1
       0
       0
       CHI
          Chile
       3
       1
       0
    
    
      4
       GRE
            Greece
       0
       0
       0
       COL
       Colombia
       3
       1
       0
    
  

5 rows × 10 columns



In [32]:

    
# The second half of the table starts at row 64; show its first five rows.
matches_df.iloc[64:69]









    Out[32]:






  
    
      
      code_1
      country_1
      goals_1
      home_1
      penalties_1
      code_2
      country_2
      goals_2
      home_2
      penalties_2
    
  
  
    
      64
       BRA
         Brazil
       3
       1
       0
       CRO
           Croatia
       1
       0
       0
    
    
      65
       MEX
         Mexico
       1
       1
       0
       CMR
          Cameroon
       0
       0
       0
    
    
      66
       ESP
          Spain
       1
       1
       0
       NED
       Netherlands
       5
       0
       0
    
    
      67
       CHI
          Chile
       3
       1
       0
       AUS
         Australia
       1
       0
       0
    
    
      68
       COL
       Colombia
       3
       1
       0
       GRE
            Greece
       0
       0
       0
    
  

5 rows × 10 columns



In [33]:

    
# Summary statistics for the numeric columns of the paired-match table.
matches_df.describe()









    Out[33]:






  
    
      
      goals_1
      home_1
      penalties_1
      goals_2
      home_2
      penalties_2
    
  
  
    
      count
       128.000000
       128.000000
       128.000000
       128.000000
       128.000000
       128.000000
    
    
      mean
         1.328125
         0.500000
         0.203125
         1.328125
         0.500000
         0.203125
    
    
      std
         1.292677
         0.501965
         0.826332
         1.292677
         0.501965
         0.826332
    
    
      min
         0.000000
         0.000000
         0.000000
         0.000000
         0.000000
         0.000000
    
    
      25%
         0.000000
         0.000000
         0.000000
         0.000000
         0.000000
         0.000000
    
    
      50%
         1.000000
         0.500000
         0.000000
         1.000000
         0.500000
         0.000000
    
    
      75%
         2.000000
         1.000000
         0.000000
         2.000000
         1.000000
         0.000000
    
    
      max
         7.000000
         1.000000
         5.000000
         7.000000
         1.000000
         5.000000
    
  

8 rows × 6 columns



In [39]:

    
# Let's look at a histogram of values for goals.
# Goal counts are integers, so use unit-wide bins centred on each count
# (edges at -0.5, 0.5, ..., max + 0.5). `bins=8` would instead split [0, max]
# into eight equal slices with non-integer edges, so the bars would drift away
# from the goal counts they represent.
goal_bin_edges = arange(-0.5, matches_df['goals_1'].max() + 1.5)
matches_df['goals_1'].hist(bins=goal_bin_edges)
xlabel('Goals scored by a team in one match')
ylabel('Number of team-match rows')
title('Distribution of goals per team per match')
show()



In [52]:

    
# Clearly teams usually score once.
# If we simply fit a poisson distribution to this data, what value would we use?
# We'll answer this question by seeing which Poisson distribution best fits the data
from scipy.stats import poisson


def nllf(l):
    """Negative log-likelihood of the observed goals under a single Poisson rate.

    `l` is the *log* of the rate: the Poisson mean used is exp(l), so any real
    value of `l` maps to a positive rate. The result is minus the sum of
    log-pmf values over every entry of matches_df['goals_1'], i.e. the
    vectorised form of looping over the goals and accumulating logpmf(goal).
    """
    rate = exp(l)
    log_probabilities = poisson(rate).logpmf(matches_df['goals_1'])
    return -log_probabilities.sum()


nllf(1)









    Out[52]:





252.1465755469828



In [53]:

    
# Optimize! Minimise the negative log-likelihood over the log-rate, starting
# from a log-rate of 0 (a rate of exp(0) = 1).
from scipy.optimize import minimize

initial_log_rate = [0]
result = minimize(nllf, x0=initial_log_rate)
result









    Out[53]:





   status: 0
  success: True
     njev: 9
     nfev: 27
 hess_inv: array([[ 0.00587375]])
      fun: 195.96591207201465
        x: array([ 0.28376822])
  message: 'Optimization terminated successfully.'
      jac: array([  3.81469727e-06])



In [54]:

    
# nllf is parameterised by the log of the rate, so exponentiate the optimum to
# get the fitted Poisson rate itself. `result.x` is a length-1 array, so
# `simple_rate` is one too.
simple_rate = exp(result.x)
simple_rate









    Out[54]:





array([ 1.32812506])



In [68]:

    
# Map each goal count to the number of rows in which that count occurs.
goal_hist = matches_df['goals_1'].value_counts().to_dict()



In [80]:

    
# See how this overlays: observed frequency of each goal count next to the
# frequency expected under the fitted Poisson rate (pmf scaled by the number
# of observations).

x = arange(8)
width = 0.33
observed_counts = [goal_hist.get(goals, 0) for goals in x]
n_observations = len(matches_df['goals_1'])
expected_counts = poisson(simple_rate).pmf(x) * n_observations

bar(x, observed_counts, width, alpha=0.5, label='Actual')
bar(x + width, expected_counts, width, color='r', alpha=0.5, label='Simple Poisson')
legend(loc='best')
show()

	code_1	country_1	goals_1	code_2	country_2	goals_2	home_2
0	CRO	Croatia	1	BRA	Brazil	3	1
1	CMR	Cameroon	0	MEX	Mexico	1	1
2	NED	Netherlands	5	ESP	Spain	1	1
3	AUS	Australia	1	CHI	Chile	3	1
4	GRE	Greece	0	COL	Colombia	3	1

	goals_1	home_1	penalties_1	goals_2	home_2	penalties_2
count	128.000000	128.000000	128.000000	128.000000	128.000000	128.000000
mean	1.328125	0.500000	0.203125	1.328125	0.500000	0.203125
std	1.292677	0.501965	0.826332	1.292677	0.501965	0.826332
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	1.000000	0.500000	0.000000	1.000000	0.500000	0.000000
75%	2.000000	1.000000	0.000000	2.000000	1.000000	0.000000
max	7.000000	1.000000	5.000000	7.000000	1.000000	5.000000